001    /***********************************************************************************************
002     *              Tekijä:                 Jukka Salminen
003     *              Opiskelijanumero:       i79947
004     *              Email:                  jukka.salminen@uwasa.fi
005     *              Tekoaika:               23.9.2002
006     *              Kurssi/vuosi:           Ohjelmoinnin jatkokurssi/kevät 2002
007     *              Työn tunnus:            Harjoitustyö: Taulukkoeditori web-sivulle
008     ***********************************************************************************************
009     * QDParser-luokka
010     * ===============
011     * Lähde: www.javaworld.com
012     *http://www.javaworld.com/javaworld/javatips/jw-javatip128.html
013     * Quick and Dirty xml parser.  This parser is, like the SAX parser,
014     * an event based parser, but with much less functionality.  */
015    package jsxml.qdxml;
016    import java.io.*;
017    import java.util.*;
018    
019    public class QDParser
020    {
021            private static int popMode(Stack st)
022            {
023                    if(!st.empty())
024                            return ((Integer)st.pop()).intValue();
025                    else
026                            return PRE;
027            }
028            private final static int
029            TEXT = 1,
030            ENTITY = 2,
031            OPEN_TAG = 3,
032            CLOSE_TAG = 4,
033            START_TAG = 5,
034            ATTRIBUTE_LVALUE = 6,
035            ATTRIBUTE_EQUAL = 9,
036            ATTRIBUTE_RVALUE = 10,
037            QUOTE = 7,
038            IN_TAG = 8,
039            SINGLE_TAG = 12,
040            COMMENT = 13,
041            DONE = 11,
042            DOCTYPE = 14,
043            PRE = 15,
044            CDATA = 16;
045            public static void parse(DocHandler doc,Reader r) throws Exception
046            {
047                    Stack st = new Stack();
048                    int depth = 0;
049                    int mode = PRE;
050                    int c = 0;
051                    int quotec = '"';
052                    depth = 0;
053                    StringBuffer sb = new StringBuffer();
054                    StringBuffer etag = new StringBuffer();
055                    String tagName = null;
056                    String lvalue = null;
057                    String rvalue = null;
058                    Hashtable attrs = null;
059                    st = new Stack();
060                    doc.startDocument();
061                    int line=1, col=0;
062                    boolean eol = false;
063                    String nextchar = new String("");
064                    while((c = r.read()) != -1)
065                    {
066                            
067                            // We need to map \r, \r\n, and \n to \n
068                            // See XML spec section 2.11
069                            if(c == '\n' && eol)
070                            {
071                                    eol = false;
072                                    continue;
073                            } else if(eol)
074                            {
075                                    eol = false;
076                            } else if(c == '\n')
077                            {
078                                    line++;
079                                    col=0;
080                            } else if(c == '\r')
081                            {
082                                    eol = true;
083                                    c = '\n';
084                                    line++;
085                                    col=0;
086                            } else
087                            {
088                                    col++;
089                            }
090                            
091                            if(mode == DONE)
092                            {
093                                    doc.endDocument();
094                                    return;
095                                    
096                                    // We are between tags collecting text.
097                            } else if(mode == TEXT)
098                            {
099                                    if(c == '<')
100                                    {
101                                            st.push(new Integer(mode));
102                                            mode = START_TAG;
103                                            if(sb.length() > 0)
104                                            {
105                                                    doc.text(sb.toString());
106                                                    sb.setLength(0);
107                                            }
108                                    } else if(c == '&')
109                                    {
110                                            st.push(new Integer(mode));
111                                            mode = ENTITY;
112                                            etag.setLength(0);
113                                    } else
114                                            sb.append((char)c);
115                                    
116                                    // we are processing a closing tag: e.g. </foo>
117                            } else if(mode == CLOSE_TAG)
118                            {
119                                    if(c == '>')
120                                    {
121                                            mode = popMode(st);
122                                            tagName = sb.toString();
123                                            sb.setLength(0);
124                                            depth--;
125                                            if(depth==0)
126                                                    mode = DONE;
127                                            doc.endElement(tagName);
128                                    } else
129                                    {
130                                            sb.append((char)c);
131                                    }
132                                    
133                                    // we are processing CDATA
134                            } else if(mode == CDATA)
135                            {
136                                    if(c == '>'
137                                    && sb.toString().endsWith("]]"))
138                                    {
139                                            sb.setLength(sb.length()-2);
140                                            doc.text(sb.toString());
141                                            sb.setLength(0);
142                                            mode = popMode(st);
143                                    } else
144                                            sb.append((char)c);
145                                    
146                                    // we are processing a comment.  We are inside
147                                    // the <!-- .... --> looking for the -->.
148                            } else if(mode == COMMENT)
149                            {
150                                    if(c == '>'
151                                    && sb.toString().endsWith("--"))
152                                    {
153                                            sb.setLength(0);
154                                            mode = popMode(st);
155                                    } else
156                                            sb.append((char)c);
157                                    
158                                    // We are outside the root tag element
159                            } else if(mode == PRE)
160                            {
161                                    if(c == '<')
162                                    {
163                                            mode = TEXT;
164                                            st.push(new Integer(mode));
165                                            mode = START_TAG;
166                                    }
167                                    
168                                    // We are inside one of these <? ... ?>
169                                    // or one of these <!DOCTYPE ... >
170                            } else if(mode == DOCTYPE)
171                            {
172                                    if(c == '>')
173                                    {
174                                            mode = popMode(st);
175                                            if(mode == TEXT) mode = PRE;
176                                    }
177                                    
178                                    // we have just seen a < and
179                                    // are wondering what we are looking at
180                                    // <foo>, </foo>, <!-- ... --->, etc.
181                            } else if(mode == START_TAG)
182                            {
183                                    mode = popMode(st);
184                                    if(c == '/')
185                                    {
186                                            st.push(new Integer(mode));
187                                            mode = CLOSE_TAG;
188                                    } else if (c == '?')
189                                    {
190                                            mode = DOCTYPE;
191                                    } else
192                                    {
193                                            st.push(new Integer(mode));
194                                            mode = OPEN_TAG;
195                                            tagName = null;
196                                            attrs = new Hashtable();
197                                            sb.append((char)c);
198                                    }
199                                    
200                                    // we are processing an entity, e.g. <, », etc.
201                            } else if(mode == ENTITY)
202                            {
203                                    if(c == ';')
204                                    {
205                                            mode = popMode(st);
206                                            String cent = etag.toString();
207                                            etag.setLength(0);
208                                            if(cent.equals("lt"))
209                                                    sb.append('<');
210                                            else if(cent.equals("gt"))
211                                                    sb.append('>');
212                                            else if(cent.equals("amp"))
213                                                    sb.append('&');
214                                            else if(cent.equals("quot"))
215                                                    sb.append('"');
216                                            else if(cent.equals("apos"))
217                                                    sb.append('\'');
218                                            // Could parse hex entities if we wanted to
219                                            //else if(cent.startsWith("#x"))
220                                            //sb.append((char)Integer.parseInt(cent.substring(2),16));
221                                            else if(cent.startsWith("#"))
222                                                    sb.append((char)Integer.parseInt(cent.substring(1)));
223                                            // Insert custom entity definitions here
224                                            else
225                                                    exc("Unknown entity: &"+cent+";",line,col);
226                                    } else
227                                    {
228                                            etag.append((char)c);
229                                    }
230                                    
231                                    // we have just seen something like this:
232                                    // <foo a="b"/
233                                    // and are looking for the final >.
234                            } else if(mode == SINGLE_TAG)
235                            {
236                                    if(tagName == null)
237                                            tagName = sb.toString();
238                                    if(c != '>')
239                                            exc("Expected > for tag: <"+tagName+"/>",line,col);
240                                    doc.startElement(tagName,attrs);
241                                    doc.endElement(tagName);
242                                    if(depth==0)
243                                    {
244                                            doc.endDocument();
245                                            return;
246                                    }
247                                    sb.setLength(0);
248                                    attrs = new Hashtable();
249                                    tagName = null;
250                                    mode = popMode(st);
251                                    
252                                    // we are processing something
253                                    // like this <foo ... >.  It could
254                                    // still be a <!-- ... --> or something.
255                            } else if(mode == OPEN_TAG)
256                            {
257                                    if(c == '>')
258                                    {
259                                            if(tagName == null)
260                                                    tagName = sb.toString();
261                                            sb.setLength(0);
262                                            depth++;
263                                            doc.startElement(tagName,attrs);
264                                            tagName = null;
265                                            attrs = new Hashtable();
266                                            mode = popMode(st);
267                                    } else if(c == '/')
268                                    {
269                                            mode = SINGLE_TAG;
270                                    } else if(c == '-' && sb.toString().equals("!-"))
271                                    {
272                                            mode = COMMENT;
273                                    } else if(c == '[' && sb.toString().equals("![CDATA"))
274                                    {
275                                            mode = CDATA;
276                                            sb.setLength(0);
277                                    } else if(c == 'E' && sb.toString().equals("!DOCTYP"))
278                                    {
279                                            sb.setLength(0);
280                                            mode = DOCTYPE;
281                                    } else if(Character.isWhitespace((char)c))
282                                    {
283                                            tagName = sb.toString();
284                                            sb.setLength(0);
285                                            mode = IN_TAG;
286                                    } else
287                                    {
288                                            sb.append((char)c);
289                                    }
290                                    
291                                    // We are processing the quoted right-hand side
292                                    // of an element's attribute.
293                            } else if(mode == QUOTE)
294                            {
295                                    if(c == quotec)
296                                    {
297                                            rvalue = sb.toString();
298                                            sb.setLength(0);
299                                            attrs.put(lvalue,rvalue);
300                                            mode = IN_TAG;
301                                            // See section the XML spec, section 3.3.3
302                                            // on normalization processing.
303                                    } else if(" \r\n\u0009".indexOf(c)>=0)
304                                    {
305                                            sb.append(' ');
306                                    } else if(c == '&')
307                                    {
308                                            st.push(new Integer(mode));
309                                            mode = ENTITY;
310                                            etag.setLength(0);
311                                    } else
312                                    {
313                                            sb.append((char)c);
314                                    }
315                                    
316                            } else if(mode == ATTRIBUTE_RVALUE)
317                            {
318                                    if(c == '"' || c == '\'')
319                                    {
320                                            quotec = c;
321                                            mode = QUOTE;
322                                    } else if(Character.isWhitespace((char)c))
323                                    {
324                                            ;
325                                    } else
326                                    {
327                                            exc("Error in attribute processing",line,col);
328                                    }
329                                    
330                            } else if(mode == ATTRIBUTE_LVALUE)
331                            {
332                                    if(Character.isWhitespace((char)c))
333                                    {
334                                            lvalue = sb.toString();
335                                            sb.setLength(0);
336                                            mode = ATTRIBUTE_EQUAL;
337                                    } else if(c == '=')
338                                    {
339                                            lvalue = sb.toString();
340                                            sb.setLength(0);
341                                            mode = ATTRIBUTE_RVALUE;
342                                    } else
343                                    {
344                                            sb.append((char)c);
345                                    }
346                                    
347                            } else if(mode == ATTRIBUTE_EQUAL)
348                            {
349                                    if(c == '=')
350                                    {
351                                            mode = ATTRIBUTE_RVALUE;
352                                    } else if(Character.isWhitespace((char)c))
353                                    {
354                                            ;
355                                    } else
356                                    {
357                                            exc("Error in attribute processing.",line,col);
358                                    }
359                                    
360                            } else if(mode == IN_TAG)
361                            {
362                                    if(c == '>')
363                                    {
364                                            mode = popMode(st);
365                                            doc.startElement(tagName,attrs);
366                                            depth++;
367                                            tagName = null;
368                                            attrs = new Hashtable();
369                                    } else if(c == '/')
370                                    {
371                                            mode = SINGLE_TAG;
372                                    } else if(Character.isWhitespace((char)c))
373                                    {
374                                            ;
375                                    } else
376                                    {
377                                            mode = ATTRIBUTE_LVALUE;
378                                            sb.append((char)c);
379                                    }
380                            }
381                    }
382                    if(mode == DONE)
383                            doc.endDocument();
384                    else
385                            exc("missing end tag",line,col);
386            }
387            private static void exc(String s,int line,int col)
388            throws Exception
389            {
390                    throw new Exception(s+" near line "+line+", column "+col);
391            }
392    }